Load R packages and define colour functions

library(tidyverse) ; library(reshape2) ; library(glue) ; library(plotly) ; library(dendextend)
library(RColorBrewer) ; library(viridis) ; require(gridExtra) ; library(colorspace)
library(GGally) ; library(ggpubr) ; library(ggExtra)
library(WGCNA)
library(expss)
library(polycor)
library(biomaRt)
library(clusterProfiler) ; library(ReactomePA) ; library(DOSE) ; library(org.Hs.eg.db)
library(knitr) ; library(kableExtra)

SFARI_colour_hue = function(r) {
  pal = c('#FF7631','#FFB100','#E8E328','#8CC83F','#62CCA6','#59B9C9','#b3b3b3','#808080','gray','#d9d9d9')[r]
}

# Get colors from the ggplot palette
gg_colour_hue = function(n) {
  hues = seq(15, 375, length = n+1)
  pal = hcl(h = hues, l = 65, c = 100)[1:n]
}

# Assign an HCL rainbow colour to each module
get_mod_colours = function(mods){
  
  n = length(unique(mods))-1
  set.seed(123) ; rand_order = sample(1:n)
  mod_colors = c('white', gg_colour_hue(n)[rand_order])
  names(mod_colors) = mods %>% table %>% names
  
  return(mod_colors)
}

Load preprocessed dataset (code in 2.1.Preprocessing_pipeline.Rmd)

# SFARI Genes
SFARI_genes = read_csv('./../../SFARI/Data/SFARI_genes_01-03-2020_w_ensembl_IDs.csv')

# Load Gandal dataset
load('./../Data/preprocessedData/preprocessed_data.RData')
datExpr = datExpr %>% data.frame

# WGCNA metrics
WGCNA_metrics = read.csv('./../Data/preprocessedData/WGCNA_metrics.csv')

# Updates genes_info with SFARI information and clusters
genes_info = genes_info %>% left_join(SFARI_genes, by = 'ID') %>% 
             left_join(datGenes %>% mutate(ID = rownames(.)) %>% dplyr::select(ID, hgnc_symbol), by = 'ID') %>%
             dplyr::select(ID, hgnc_symbol, log2FoldChange, shrunken_log2FoldChange, significant, Neuronal) %>%
             left_join(WGCNA_metrics, by = 'ID') %>% dplyr::select(-contains('pval'))


rm(dds, WGCNA_metrics)




3.3.1 Top clusters by Cluster-diagnosis correlation


Selecting the top clusters

Modules with a cluster-diagnosis correlation magnitude larger than 0.7

plot_data = genes_info %>% dplyr::select(Module, module_number, MTcor) %>% distinct %>% 
            mutate(alpha = ifelse(abs(MTcor)>0.7, 1, 0.3))

top_modules = plot_data %>% arrange(desc(MTcor)) %>% filter(abs(MTcor) > 0.7) %>% pull(Module) %>% as.character

The 3 modules that fulfill this condition are clusters 6, 38, 63

ggplotly(plot_data %>% mutate(module_number=ifelse(module_number == max(module_number), 'No cluster', 
                                                   paste('Cluster', module_number))) %>%
         ggplot(aes(reorder(module_number, -MTcor), MTcor)) + 
         geom_bar(stat='identity', fill = plot_data$Module, alpha = plot_data$alpha) + 
         geom_hline(yintercept =c(0.7, -0.7), color = 'gray', linetype = 'dotted') + 
         xlab('Clusters')+ ylab('Cluster-diagnosis correlation') + theme_minimal() + 
         theme(axis.text.x = element_text(angle = 90, hjust = 1)))

The modules consist mainly of points with high values in PC2 (which we know is related to lfc), so this result is consistent with the high correlation between Module and Diagnosis, although some of the points with the highest PC2 values do not belong to these top modules

The genes belonging to the modules with the positive Module-Diagnosis correlation have positive LFC values.

pca = datExpr %>% prcomp

plot_data = data.frame('ID'=rownames(datExpr), 'PC1' = pca$x[,1], 'PC2' = pca$x[,2]) %>%
            left_join(genes_info %>% dplyr::select(ID,Module,module_number,gene.score,hgnc_symbol), by='ID') %>%
            mutate(ImportantModules = ifelse(Module %in% top_modules, as.character(Module), 'Others')) %>%
            mutate(ImportantModules_number = ifelse(ImportantModules == 'Others', 'Others',
                   paste('Cluster', genes_info$module_number[genes_info$Module==ImportantModules]))) %>%
            mutate(color = ifelse(ImportantModules=='Others', 'gray', ImportantModules),
                   alpha = ifelse(ImportantModules=='Others', 0.2, 0.4),
                   gene_id = paste0(ID, ' (', hgnc_symbol, ')')) %>%
            apply_labels(ImportantModules_number = 'Top Clusters')

cro(plot_data$ImportantModules_number)
 #Total 
 Top Clusters 
   Cluster 38  186
   Cluster 6  61
   Cluster 63  219
   Others  12690
   #Total cases  13156
ggplotly(plot_data %>% ggplot(aes(PC1, PC2, color=ImportantModules)) + 
         geom_point(alpha=plot_data$alpha, color=plot_data$color, aes(ID=gene_id)) + theme_minimal() +
         xlab(paste0('PC1 (',round(100*summary(pca)$importance[2,1],2),'%)')) +
         ylab(paste0('PC2 (',round(100*summary(pca)$importance[2,2],2),'%)')) +
         ggtitle('Genes belonging to the clusters with the strongest relation to ASD'))
rm(pca)


Module Eigengenes


Since these modules have the strongest relation to autism, this pattern should be reflected in their model eigengenes, having two different behaviours for the samples corresponding to autism and the ones corresponding to control

In all cases, the Eigengenes separate the behaviour between autism and control samples very clearly (p-value < \(10^{-4}\)). Modules with positive Module-Diagnosis correlation have higher eigengenes in the ASD samples

plot_EGs = function(module){

  plot_data = data.frame('ID' = rownames(MEs), 'MEs' = MEs[,paste0('ME', module)], 
                         'Diagnosis' = datMeta$Diagnosis)

  p = plot_data %>% ggplot(aes(Diagnosis, MEs, fill=Diagnosis)) + 
      geom_boxplot(outlier.colour='#cccccc', outlier.shape='o', outlier.size=3) +
      ggtitle(paste0('Cluster ', genes_info$module_number[genes_info$Module==module][1], 
                     '  (MTcor=',round(genes_info$MTcor[genes_info$Module == module][1],2),')')) + 
      stat_compare_means(method = 't.test', method.args = list(var.equal = FALSE), label = 'p.signif',
                        ref.group = 'ASD') +
      ylab('Cluster Eigengenes') + theme_minimal() + theme(legend.position='none')#, plot.margin = margin(0,0,0,2,'cm'))
  
  return(p)
}


# Calculate Module Eigengenes
ME_object = datExpr %>% t %>% moduleEigengenes(colors = genes_info$Module)
MEs = orderMEs(ME_object$eigengenes)

p1 = plot_EGs(top_modules[1])
p2 = plot_EGs(top_modules[2])
p3 = plot_EGs(top_modules[3])

grid.arrange(p1, p2, p3, nrow=1)

rm(plot_EGs, ME_object, MEs, p1, p2, p3)


Identifying representative genes for each Module


In the WGCNA pipeline, the most representative genes of each module are selected based on having a high module membership as well as a high (absolute) gene significance, so I’m going to do the same

SFARI Genes don’t seem to be more representative than the rest of the genes

create_plot = function(module){
  
  plot_data = genes_info %>% dplyr::select(ID, paste0('MM.',gsub('#', '', module)), GS, gene.score) %>% 
              filter(genes_info$Module==module)
  colnames(plot_data)[2] = 'MM'
  
  SFARI_colors = as.numeric(names(table(as.character(plot_data$gene.score)[plot_data$gene.score!='Others'])))
  
  p = ggplotly(plot_data %>% mutate(gene.score = ifelse(gene.score =='Others', 'Not in SFARI', 
                                                        as.character(gene.score))) %>% 
               ggplot(aes(MM, GS, color=gene.score)) +
               geom_point(alpha=0.5, aes(ID=ID)) +  xlab('Cluster Membership') + ylab('Gene Significance') +
               ggtitle(paste0('Cluster ', genes_info$module_number[genes_info$Module==module][1], '  (MTcor = ', 
                              round(genes_info$MTcor[genes_info$Module == module][1],2),')')) +
               scale_color_manual(values=SFARI_colour_hue(r=c(SFARI_colors,7))) + theme_minimal())
  
  # For thesis
  # p = plot_data %>% mutate(gene.score = ifelse(gene.score =='Others', 'Not in SFARI', 
  #                                              ifelse(gene.score=='Neuronal', as.character(gene.score), 
  #                                                     paste('Score',as.character(gene.score))))) %>% 
  #     mutate(gene.score = factor(gene.score, levels = c('Score 1', 'Score 2', 'Score 3', 'Neuronal', 
  #                                                       'Not in SFARI')),
  #            alpha = ifelse(grepl('Score', gene.score), 1, 0)) %>%
  #     ggplot(aes(MM, GS, color=gene.score, shape = gene.score)) + geom_point(aes(alpha = alpha)) +  
  #     xlab('Cluster Membership') + ylab('Gene Significance') + scale_alpha_binned(range = c(0.5, 0.9)) +
  #     scale_color_manual(values=SFARI_colour_hue(r=c(SFARI_colors,7))) + theme_minimal() + 
  #     labs(color = 'SFARI Score', shape = 'SFARI Score') + guides(alpha = FALSE)
  # if(module != top_modules[length(top_modules)]) {p = p + theme(legend.position = 'none')}
  
  return(p)
}

create_plot(top_modules[1])
create_plot(top_modules[2])
create_plot(top_modules[3])
rm(create_plot)

Top 10 genes for each module


Ordered by \(\frac{MM+|GS|}{2}\)

There aren’t that many SFARI genes in the top genes of the modules

create_table = function(module){
  top_genes = genes_info %>% dplyr::select(ID, hgnc_symbol, paste0('MM.',gsub('#','',module)), GS, gene.score) %>% 
              filter(genes_info$Module==module) %>% dplyr::rename('MM' = paste0('MM.',gsub('#','',module))) %>% 
              mutate(Relevance = (MM+abs(GS))/2, 
                     gene.score = ifelse(gene.score == 'Others', 'Not in SFARI', as.character(gene.score))) %>% 
              arrange(by=-Relevance) %>% top_n(10) %>% 
              dplyr::rename('Gene Symbol' = hgnc_symbol, 'SFARI Score' = gene.score)
  return(top_genes)
}

top_genes = list()
for(i in 1:length(top_modules)) top_genes[[i]] = create_table(top_modules[i])

kable(top_genes[[1]] %>% dplyr::select(-ID), 
      caption=paste0('Top 10 genes for Module ', genes_info$module_number[genes_info$Module==top_modules[1]][1], 
      '  (MTcor = ', round(genes_info$MTcor[genes_info$Module == top_modules[1]][1],2),')')) %>% 
      kable_styling(full_width = F)
Top 10 genes for Module 6 (MTcor = 0.74)
Gene Symbol MM GS SFARI Score Relevance
S100A10 0.8286804 0.7669233 Not in SFARI 0.7978018
C1R 0.7776892 0.6460530 Not in SFARI 0.7118711
SERPINA3 0.8029590 0.6126634 Not in SFARI 0.7078112
PLEKHA4 0.6370537 0.6873787 Not in SFARI 0.6622162
CHI3L1 0.6808357 0.6239026 Not in SFARI 0.6523691
TNFRSF1A 0.7280104 0.5689093 Not in SFARI 0.6484598
CD44 0.8148062 0.4657379 Not in SFARI 0.6402720
MARCH3 0.7004518 0.5588356 Not in SFARI 0.6296437
BST2 0.5722832 0.6073472 Not in SFARI 0.5898152
GBP1 0.6990798 0.4764819 Not in SFARI 0.5877808
kable(top_genes[[2]] %>% dplyr::select(-ID), 
      caption=paste0('Top 10 genes for Module ', genes_info$module_number[genes_info$Module==top_modules[2]][1], 
      '  (MTcor = ', round(genes_info$MTcor[genes_info$Module == top_modules[2]][1],2),')')) %>% 
      kable_styling(full_width = F)
Top 10 genes for Module 63 (MTcor = 0.7)
Gene Symbol MM GS SFARI Score Relevance
CNN3 0.8312124 0.5670377 Neuronal 0.6991250
TCF7L1 0.6484687 0.6990050 Neuronal 0.6737368
SSR3 0.6841525 0.6498357 Not in SFARI 0.6669941
GAREM 0.8074868 0.5108372 Not in SFARI 0.6591620
FERMT2 0.7144697 0.5957060 Not in SFARI 0.6550878
GNG12 0.7281769 0.5769803 Not in SFARI 0.6525786
TMED5 0.7091431 0.5753787 Not in SFARI 0.6422609
TGFB2 0.6999958 0.5451410 Neuronal 0.6225684
DDAH2 0.7440530 0.4930602 Not in SFARI 0.6185566
BFAR 0.6455342 0.5803525 Not in SFARI 0.6129434
kable(top_genes[[3]] %>% dplyr::select(-ID), 
      caption=paste0('Top 10 genes for Module ', genes_info$module_number[genes_info$Module==top_modules[3]][1], 
      '  (MTcor = ', round(genes_info$MTcor[genes_info$Module == top_modules[3]][1],2),')')) %>% 
      kable_styling(full_width = F)
Top 10 genes for Module 38 (MTcor = 0.7)
Gene Symbol MM GS SFARI Score Relevance
PRKAR2A 0.7698334 0.7455345 Neuronal 0.7576840
SNTB2 0.7874213 0.6238400 Not in SFARI 0.7056306
NAA15 0.7745629 0.5968803 1 0.6857216
FAM63B 0.6561485 0.6868175 Not in SFARI 0.6714830
YAF2 0.7010083 0.6341459 Not in SFARI 0.6675771
LRRCC1 0.7584183 0.5177323 Not in SFARI 0.6380753
ZNF827 0.7071327 0.5346496 3 0.6208911
MECP2 0.7191566 0.5171252 1 0.6181409
TSHZ1 0.6201145 0.6137175 Not in SFARI 0.6169160
UBE3A 0.7751055 0.4536326 1 0.6143690
rm(create_table, i)

Most of the top genes, idenpendently of the cluster, have very high absolute values for the 2nd principal component

pca = datExpr %>% prcomp

ids = c()
for(tg in top_genes) ids = c(ids, tg$ID)

plot_data = data.frame('ID'=rownames(datExpr), 'gene_name' = genes_info$hgnc_symbol, 
                       'PC1' = pca$x[,1], 'PC2' = pca$x[,2]) %>%
            left_join(genes_info, by='ID') %>% dplyr::select(ID, PC1, PC2,gene_name, Module, gene.score) %>%
            mutate(color = ifelse(Module %in% top_modules, as.character(Module), '#cccccc')) %>%
            mutate(alpha = ifelse(color %in% top_modules & ID %in% ids, 1,
                                  ifelse(color %in% top_modules, 0.25, 0.1)))

plot_data %>% ggplot(aes(PC1, PC2)) + geom_point(alpha=plot_data$alpha, color=plot_data$color) + 
              geom_text(aes(label=ifelse(ID %in% ids,as.character(gene_name),'')),
                        color = plot_data$color, size = 2.5, hjust = 0, vjust = 0) +
              xlab(paste0('PC1 (',round(100*summary(pca)$importance[2,1],2),'%)')) +
              ylab(paste0('PC2 (',round(100*summary(pca)$importance[2,2],2),'%)')) +
              theme_minimal() + ggtitle('Most relevant genes for top clusters')

rm(pca, tg, plot_data)


11/30 genes are differentially expressed

4/30 genes are SFARI Genes (ZNF827, NAA15, MECP2, UBE3A)

Level of expression by Diagnosis for top genes, ordered by relevance (as defined above): There is a visible difference in level of expression between diagnosis groups in all of these genes

create_plot = function(i){
  
  plot_data = datExpr[rownames(datExpr) %in% top_genes[[i]]$ID,] %>% mutate('ID' = rownames(.)) %>% 
              melt(id.vars='ID') %>% mutate(variable = gsub('X','',variable)) %>%
              left_join(top_genes[[i]], by='ID') %>%
              left_join(datMeta %>% dplyr::select(Sample_ID, Diagnosis),
                        by = c('variable'='Sample_ID')) %>% arrange(desc(Relevance))
  
  p = ggplotly(plot_data %>% mutate(external_gene_id=factor(`Gene Symbol`, 
                                    levels=unique(plot_data$`Gene Symbol`), ordered=T)) %>%
               ggplot(aes(external_gene_id, value, fill=Diagnosis)) + geom_boxplot() + theme_minimal() +
                      ggtitle(paste0('Top genes for cluster ', 
                                     genes_info$module_number[genes_info$Module==top_modules[i]][1], 
                                     ' (MTcor = ',
                      round(genes_info$MTcor[genes_info$Module == top_modules[i]][1],2), ')')) + 
                      xlab('') + ylab('Level of Expression') +
                      theme(axis.text.x = element_text(angle = 90, hjust = 1)))
  
  # For thesis
  # p = plot_data %>% mutate(external_gene_id=factor(`Gene Symbol`, 
  #                                   levels=rev(unique(plot_data$`Gene Symbol`)), ordered=T)) %>%
  #     ggplot(aes(external_gene_id, value, fill=Diagnosis)) + 
  #     geom_boxplot(outlier.colour='gray', outlier.shape='o', outlier.size=3) +
  #     xlab('') + ylab('Level of expression') + theme_minimal() + coord_flip()
  # if(i < length(top_modules)) {p = p + theme(legend.position = 'none')}
  
  return(p)
  
}

create_plot(1)
create_plot(2)
create_plot(3)
rm(create_plot)





3.3.2 Top clusters by enrichment in SFARI Genes


Selecting the top clusters

Using ORA, as it was done in the previous section and selecting as top clusters the ones with an adjusted p-value lower than \(10^{-3}\) (enrichment higher than 0.999)

# Calculate % and ORA of SFARI Genes in each module

#modules = unique(genes_info$Module[genes_info$Module!='gray']) %>% as.character
modules = unique(genes_info$Module) %>% as.character

# We need the entrez ID of the genes for this
getinfo = c('ensembl_gene_id','entrezgene')
mart = useMart(biomart='ENSEMBL_MART_ENSEMBL', dataset='hsapiens_gene_ensembl', 
               host='feb2014.archive.ensembl.org')
biomart_output = getBM(attributes=getinfo, filters=c('ensembl_gene_id'), 
                       values=genes_info$ID, mart=mart) %>%
                 left_join(genes_info %>% dplyr::select(ID, Module,gene.score), by = c('ensembl_gene_id'='ID'))

# We need to build a term2gene dataframe with the genes and their SFARI Scores
term2gene = biomart_output %>% mutate(term = ifelse(gene.score == 'Others', 'Others', 'SFARI'), 
                                      'gene' = entrezgene) %>%  dplyr::select(term, gene) %>% distinct


enrichment_data = data.frame('Module' = modules, 'size' = 0, 'perc_SFARI' = 0,
                             'pval_ORA' = 0, 'padj_ORA' = 0)

for(i in 1:length(modules)){
  module = modules[i]
  genes_in_module = biomart_output$entrezgene[biomart_output$Module==module]
  ORA_module = enricher(gene = genes_in_module, universe = biomart_output$entrezgene %>% as.character, 
                        pAdjustMethod = 'bonferroni', TERM2GENE = term2gene, 
                        pvalueCutoff = 1, qvalueCutoff = 1, maxGSSize = 50000) %>% 
                        data.frame %>% dplyr::select(-geneID,-Description)
  ORA_pval = ifelse('SFARI' %in% ORA_module$ID, ORA_module$pvalue[ORA_module$ID=='SFARI'], 1)
  ORA_padj = ifelse('SFARI' %in% ORA_module$ID, ORA_module$p.adjust[ORA_module$ID=='SFARI'], 1)
  enrichment_data[i,-1] = c(length(genes_in_module), 
                            mean(genes_info$gene.score[genes_info$Module==module]!='Others'), 
                            ORA_pval, ORA_padj)
}

enrichment_data = enrichment_data %>% 
                  left_join(genes_info %>% dplyr::select(Module) %>% unique, by = 'Module')

genes_info = genes_info %>% left_join(enrichment_data, by = 'Module') 

plot_data = genes_info %>% dplyr::select(Module, module_number, MTcor, size, padj_ORA) %>% distinct %>% 
            mutate(alpha = ifelse(abs(1-padj_ORA)>0.999, 1, 0.3))

top_modules = plot_data %>% arrange(padj_ORA) %>% filter((1-padj_ORA) > 0.999) %>% pull(Module) %>% as.character

rm(i, module, genes_in_module, ORA_module, ORA_pval, ORA_padj, getinfo, mart, term2gene)

The 3 modules that fulfill this condition are clusters 22, 34, 46

ggplotly(plot_data %>% ggplot(aes(MTcor, padj_ORA, size=size)) + 
         geom_point(color=plot_data$Module, alpha=0.5, aes(id=module_number)) +
         geom_hline(yintercept = 0.001, color = 'gray', linetype = 'dotted') + 
         xlab('Cluster-diagnosis Correlation') + ylab('Corrected p-values') + scale_y_log10() +
         theme_minimal() + theme(legend.position = 'none') +
         ggtitle('Clusters Significantly Enriched in SFARI Genes'))
pca = datExpr %>% prcomp

plot_data = data.frame('ID'=rownames(datExpr), 'PC1' = pca$x[,1], 'PC2' = pca$x[,2]) %>%
            left_join(genes_info %>% dplyr::select(ID,Module,module_number,gene.score,hgnc_symbol), by='ID') %>%
            mutate(ImportantModules = ifelse(Module %in% top_modules, as.character(Module), 'Others')) %>%
            mutate(ImportantModules_number = ifelse(ImportantModules == 'Others', 'Others',
                   paste('Cluster', genes_info$module_number[genes_info$Module==ImportantModules]))) %>%
            mutate(color = ifelse(ImportantModules=='Others', 'gray', ImportantModules),
                   alpha = ifelse(ImportantModules=='Others', 0.2, 0.4),
                   gene_id = paste0(ID, ' (', hgnc_symbol, ')')) %>%
            apply_labels(ImportantModules_number = 'Top Clusters')

cro(plot_data$ImportantModules_number)
 #Total 
 Top Clusters 
   Cluster 22  31
   Cluster 34  84
   Cluster 46  44
   Others  12997
   #Total cases  13156
ggplotly(plot_data %>% ggplot(aes(PC1, PC2, color=ImportantModules)) + 
         geom_point(alpha=plot_data$alpha, color=plot_data$color, aes(ID=gene_id)) + theme_minimal() +
         xlab(paste0('PC1 (',round(100*summary(pca)$importance[2,1],2),'%)')) +
         ylab(paste0('PC2 (',round(100*summary(pca)$importance[2,2],2),'%)')) +
         ggtitle('Genes belonging to the clusters with the strongest relation to ASD'))
rm(pca)


Module Eigengenes


The Eigengenes no longer separate well the behaviour between ASD and control.

plot_EGs = function(module){

  plot_data = data.frame('ID' = rownames(MEs), 'MEs' = MEs[,paste0('ME', module)], 
                         'Diagnosis' = datMeta$Diagnosis)

  p = plot_data %>% ggplot(aes(Diagnosis, MEs, fill=Diagnosis)) + 
      geom_boxplot(outlier.colour='#cccccc', outlier.shape='o', outlier.size=3) +
      ggtitle(paste0('Cluster ', genes_info$module_number[genes_info$Module==module][1], 
                     '  (MTcor=',round(genes_info$MTcor[genes_info$Module == module][1],2),')')) + 
      stat_compare_means(method = 't.test', method.args = list(var.equal = FALSE), label = 'p.signif',
                        ref.group = 'ASD') +
      ylab('Cluster Eigengenes') + theme_minimal() + theme(legend.position='none')#, plot.margin = margin(0,0,0,2,'cm'))
  
  return(p)
}


# Calculate Module Eigengenes
ME_object = datExpr %>% t %>% moduleEigengenes(colors = genes_info$Module)
MEs = orderMEs(ME_object$eigengenes)

p1 = plot_EGs(top_modules[1])
p2 = plot_EGs(top_modules[2])
p3 = plot_EGs(top_modules[3])

grid.arrange(p1, p2, p3, nrow=1)

rm(plot_EGs, ME_object, MEs, p1, p2, p3)



Identifying representative genes for each Module


In the WGCNA pipeline, the most representative genes of each module are selected based on having a high module membership as well as a high (absolute) gene significance, so I’m going to do the same

SFARI Genes don’t seem to be more representative than the rest of the genes

create_plot = function(module){
  
  plot_data = genes_info %>% dplyr::select(ID, paste0('MM.',gsub('#', '', module)), GS, gene.score) %>% 
              filter(genes_info$Module==module)
  colnames(plot_data)[2] = 'MM'
  
  SFARI_colors = as.numeric(names(table(as.character(plot_data$gene.score)[plot_data$gene.score!='Others'])))
  
  p = ggplotly(plot_data %>% mutate(gene.score = ifelse(gene.score =='Others', 'Not in SFARI', 
                                                        as.character(gene.score))) %>% 
               ggplot(aes(MM, GS, color=gene.score)) +
               geom_point(alpha=0.5, aes(ID=ID)) +  xlab('Cluster Membership') + ylab('Gene Significance') +
               ggtitle(paste0('Cluster ', genes_info$module_number[genes_info$Module==module][1], '  (MTcor = ', 
                              round(genes_info$MTcor[genes_info$Module == module][1],2),')')) +
               scale_color_manual(values=SFARI_colour_hue(r=c(SFARI_colors,7))) + theme_minimal())
  
  # For thesis
  # p = plot_data %>% mutate(gene.score = ifelse(gene.score =='Others', 'Not in SFARI',
  #                                              ifelse(gene.score=='Neuronal', as.character(gene.score),
  #                                                     paste('Score',as.character(gene.score))))) %>%
  #     mutate(gene.score = factor(gene.score, levels = c('Score 1', 'Score 2', 'Score 3', 'Neuronal',
  #                                                       'Not in SFARI')),
  #            alpha = ifelse(grepl('Score', gene.score), 1, 0)) %>%
  #     ggplot(aes(MM, GS, color=gene.score, shape = gene.score)) + geom_point(aes(alpha = alpha)) +
  #     xlab('Cluster Membership') + ylab('Gene Significance') + scale_alpha_binned(range = c(0.5, 0.9)) +
  #     scale_color_manual(values=SFARI_colour_hue(r=c(SFARI_colors,7))) + theme_minimal() +
  #     labs(color = 'SFARI Score', shape = 'SFARI Score') + guides(alpha = FALSE)
  # if(module != top_modules[length(top_modules)]) {p = p + theme(legend.position = 'none')}
  
  return(p)
}

create_plot(top_modules[1])
create_plot(top_modules[2])
create_plot(top_modules[3])
rm(create_plot)

Top 10 genes for each module


Ordered by \(\frac{MM+|GS|}{2}\)

There aren’t that many SFARI genes in the top genes of the modules

create_table = function(module){
  top_genes = genes_info %>% dplyr::select(ID, hgnc_symbol, paste0('MM.',gsub('#','',module)), GS, gene.score) %>% 
              filter(genes_info$Module==module) %>% dplyr::rename('MM' = paste0('MM.', gsub('#','',module))) %>% 
              mutate(Relevance = (MM+abs(GS))/2, 
                     gene.score = ifelse(gene.score == 'Others', 'Not in SFARI', as.character(gene.score))) %>% 
              arrange(by=-Relevance) %>% top_n(10) %>% 
              dplyr::rename('Gene Symbol' = hgnc_symbol, 'SFARI Score' = gene.score)
  return(top_genes)
}

top_genes = list()
for(i in 1:length(top_modules)) top_genes[[i]] = create_table(top_modules[i])

kable(top_genes[[1]] %>% dplyr::select(-ID), 
      caption=paste0('Top 10 genes for Module ', genes_info$module_number[genes_info$Module==top_modules[1]][1], 
      '  (MTcor = ', round(genes_info$MTcor[genes_info$Module == top_modules[1]][1],2),')')) %>% 
      kable_styling(full_width = F)
Top 10 genes for Module 34 (MTcor = -0.37)
Gene Symbol MM GS SFARI Score Relevance
MARCH4 0.6489791 -0.5539480 Not in SFARI 0.6014636
KIF21B 0.8024920 -0.3811044 Not in SFARI 0.5917982
NDST1 0.7218706 -0.3809178 Neuronal 0.5513942
WSB2 0.7159101 -0.3411021 Not in SFARI 0.5285061
CX3CL1 0.6717155 -0.3373333 Not in SFARI 0.5045244
ELFN1 0.6370292 -0.3526992 Not in SFARI 0.4948642
CBLL1 0.6188781 -0.3666208 Not in SFARI 0.4927494
HMGCS1 0.6859951 -0.2946383 Not in SFARI 0.4903167
CYP26B1 0.5767439 -0.3875241 Not in SFARI 0.4821340
PTPN11 0.7275768 -0.2350739 1 0.4813253
kable(top_genes[[2]] %>% dplyr::select(-ID), 
      caption=paste0('Top 10 genes for Module ', genes_info$module_number[genes_info$Module==top_modules[2]][1], 
      '  (MTcor = ', round(genes_info$MTcor[genes_info$Module == top_modules[2]][1],2),')')) %>% 
      kable_styling(full_width = F)
Top 10 genes for Module 46 (MTcor = 0.25)
Gene Symbol MM GS SFARI Score Relevance
KMT2A 0.6908155 0.4892386 1 0.5900270
TNRC6A 0.6498424 0.3783403 Neuronal 0.5140914
ZBTB16 0.7945128 0.2249534 3 0.5097331
SSBP3 0.6786660 0.3112656 Not in SFARI 0.4949658
CAMK1D 0.7277017 0.2598845 Neuronal 0.4937931
CNNM1 0.7767916 0.1871745 Not in SFARI 0.4819831
KCND3 0.6066690 0.3429314 3 0.4748002
ZBTB7A 0.7110965 0.2302252 Not in SFARI 0.4706609
NFIC 0.7153116 0.2246167 Not in SFARI 0.4699642
FBXO33 0.7141576 0.1880309 3 0.4510942
kable(top_genes[[3]] %>% dplyr::select(-ID), 
      caption=paste0('Top 10 genes for Module ', genes_info$module_number[genes_info$Module==top_modules[3]][1], 
      '  (MTcor = ', round(genes_info$MTcor[genes_info$Module == top_modules[3]][1],2),')')) %>% 
      kable_styling(full_width = F)
Top 10 genes for Module 22 (MTcor = -0.09)
Gene Symbol MM GS SFARI Score Relevance
EGR1 0.8816182 -0.2833232 Neuronal 0.5824707
ARC 0.8359037 -0.2857810 Neuronal 0.5608423
IER2 0.6742162 -0.2998435 Not in SFARI 0.4870298
KLF10 0.6067917 -0.3424893 Not in SFARI 0.4746405
NR4A1 0.7844912 -0.1544605 Neuronal 0.4694759
EGR2 0.7072241 -0.2164773 Neuronal 0.4618507
NR4A3 0.7458522 0.1546707 Neuronal 0.4502614
DUSP6 0.7988131 -0.0968995 Neuronal 0.4478563
NPAS4 0.7271965 0.1641813 Not in SFARI 0.4456889
CSRNP1 0.7804711 0.1010383 Not in SFARI 0.4407547
rm(create_table, i)

Top genes don’t have PC2 values as high as they did with the top genes by cluster-diagnosis correlation

pca = datExpr %>% prcomp

ids = c()
for(tg in top_genes) ids = c(ids, tg$ID)

plot_data = data.frame('ID'=rownames(datExpr), 'gene_name' = genes_info$hgnc_symbol, 
                       'PC1' = pca$x[,1], 'PC2' = pca$x[,2]) %>%
            left_join(genes_info, by='ID') %>% dplyr::select(ID, PC1, PC2,gene_name, Module, gene.score) %>%
            mutate(color = ifelse(Module %in% top_modules, as.character(Module), '#cccccc')) %>%
            mutate(alpha = ifelse(color %in% top_modules & ID %in% ids, 1,
                                  ifelse(color %in% top_modules, 0.25, 0.1)))

plot_data %>% ggplot(aes(PC1, PC2)) + geom_point(alpha=plot_data$alpha, color=plot_data$color) + 
              geom_text(aes(label=ifelse(ID %in% ids,as.character(gene_name),'')),
                        color = plot_data$color, size = 2.5, hjust = 0, vjust = 0) +
              xlab(paste0('PC1 (',round(100*summary(pca)$importance[2,1],2),'%)')) +
              ylab(paste0('PC2 (',round(100*summary(pca)$importance[2,2],2),'%)')) +
              theme_minimal() + ggtitle('Most relevant genes for top clusters')

rm(pca, tg, plot_data)


1/30 genes are differentially expressed

5/30 genes are SFARI Genes (ZBTB16, KCND3, KMT2A, PTPN11, FBXO33)

Level of expression by Diagnosis for top genes, ordered by relevance (as defined above): There is a visible difference in level of expression between diagnosis groups in all of these genes, but it’s not as strong as the differences in the top cluster by cluster-diagnosis correlation

create_plot = function(i){
  
  plot_data = datExpr[rownames(datExpr) %in% top_genes[[i]]$ID,] %>% mutate('ID' = rownames(.)) %>% 
              melt(id.vars='ID') %>% mutate(variable = gsub('X','',variable)) %>%
              left_join(top_genes[[i]], by='ID') %>%
              left_join(datMeta %>% dplyr::select(Sample_ID, Diagnosis),
                        by = c('variable'='Sample_ID')) %>% arrange(desc(Relevance))
  
  p = ggplotly(plot_data %>% mutate(external_gene_id=factor(`Gene Symbol`, 
                                    levels=unique(plot_data$`Gene Symbol`), ordered=T)) %>%
               ggplot(aes(external_gene_id, value, fill=Diagnosis)) + geom_boxplot() + theme_minimal() +
                      ggtitle(paste0('Top genes for cluster ', 
                                     genes_info$module_number[genes_info$Module==top_modules[i]][1], 
                                     ' (MTcor = ',
                      round(genes_info$MTcor[genes_info$Module == top_modules[i]][1],2), ')')) + 
                      xlab('') + ylab('Level of Expression') +
                      theme(axis.text.x = element_text(angle = 90, hjust = 1)))
  
  # # For the thesis
  # p = plot_data %>% mutate(external_gene_id=factor(`Gene Symbol`,
  #                                   levels=rev(unique(plot_data$`Gene Symbol`)), ordered=T)) %>%
  #     ggplot(aes(external_gene_id, value, fill=Diagnosis)) +
  #     geom_boxplot(outlier.colour='gray', outlier.shape='o', outlier.size=3) +
  #     xlab('') + ylab('Level of expression') + theme_minimal() + coord_flip()
  # if(i < length(top_modules)) {p = p + theme(legend.position = 'none')}
  
  return(p)
  
}

create_plot(1)
create_plot(2)
create_plot(3)
rm(create_plot)




Session info

sessionInfo()
## R version 3.6.3 (2020-02-29)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 18.04.6 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.7.1
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.7.1
## 
## locale:
##  [1] LC_CTYPE=en_GB.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_GB.UTF-8        LC_COLLATE=en_GB.UTF-8    
##  [5] LC_MONETARY=en_GB.UTF-8    LC_MESSAGES=en_GB.UTF-8   
##  [7] LC_PAPER=en_GB.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_GB.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] parallel  stats4    stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] kableExtra_1.1.0       knitr_1.32             org.Hs.eg.db_3.8.2    
##  [4] AnnotationDbi_1.46.1   IRanges_2.18.3         S4Vectors_0.22.1      
##  [7] Biobase_2.44.0         BiocGenerics_0.30.0    DOSE_3.10.2           
## [10] ReactomePA_1.28.0      clusterProfiler_3.12.0 biomaRt_2.40.5        
## [13] polycor_0.7-10         expss_0.10.7           WGCNA_1.69            
## [16] fastcluster_1.2.3      dynamicTreeCut_1.63-1  ggExtra_0.9           
## [19] ggpubr_0.2.5           magrittr_2.0.1         GGally_1.5.0          
## [22] colorspace_2.0-2       gridExtra_2.3          viridis_0.6.1         
## [25] viridisLite_0.4.0      RColorBrewer_1.1-2     dendextend_1.15.1     
## [28] plotly_4.9.2           glue_1.4.2             reshape2_1.4.4        
## [31] forcats_0.5.0          stringr_1.4.0          dplyr_1.0.1           
## [34] purrr_0.3.4            readr_1.3.1            tidyr_1.1.0           
## [37] tibble_3.1.2           ggplot2_3.3.5          tidyverse_1.3.0       
## 
## loaded via a namespace (and not attached):
##   [1] utf8_1.2.2                  tidyselect_1.1.1           
##   [3] RSQLite_2.2.0               htmlwidgets_1.5.3          
##   [5] grid_3.6.3                  BiocParallel_1.18.1        
##   [7] munsell_0.5.0               codetools_0.2-16           
##   [9] preprocessCore_1.46.0       miniUI_0.1.1.1             
##  [11] withr_2.4.2                 GOSemSim_2.10.0            
##  [13] highr_0.9                   rstudioapi_0.13            
##  [15] ggsignif_0.6.2              labeling_0.4.2             
##  [17] urltools_1.7.3              GenomeInfoDbData_1.2.1     
##  [19] polyclip_1.10-0             bit64_4.0.5                
##  [21] farver_2.1.0                vctrs_0.3.8                
##  [23] generics_0.1.0              xfun_0.25                  
##  [25] GenomeInfoDb_1.20.0         R6_2.5.1                   
##  [27] doParallel_1.0.15           graphlayouts_0.7.0         
##  [29] locfit_1.5-9.4              DelayedArray_0.10.0        
##  [31] bitops_1.0-7                cachem_1.0.6               
##  [33] reshape_0.8.8               fgsea_1.10.1               
##  [35] gridGraphics_0.5-1          assertthat_0.2.1           
##  [37] promises_1.2.0.1            scales_1.1.1               
##  [39] ggraph_2.0.3                nnet_7.3-14                
##  [41] enrichplot_1.4.0            gtable_0.3.0               
##  [43] tidygraph_1.2.0             rlang_0.4.11               
##  [45] genefilter_1.66.0           splines_3.6.3              
##  [47] lazyeval_0.2.2              acepack_1.4.1              
##  [49] impute_1.58.0               broom_0.7.0                
##  [51] europepmc_0.4               checkmate_2.0.0            
##  [53] BiocManager_1.30.16         yaml_2.2.1                 
##  [55] modelr_0.1.6                crosstalk_1.1.1            
##  [57] backports_1.2.1             httpuv_1.6.1               
##  [59] qvalue_2.16.0               Hmisc_4.4-0                
##  [61] tools_3.6.3                 ggplotify_0.1.0            
##  [63] ellipsis_0.3.2              jquerylib_0.1.4            
##  [65] ggridges_0.5.3              Rcpp_1.0.7                 
##  [67] plyr_1.8.6                  zlibbioc_1.30.0            
##  [69] base64enc_0.1-3             progress_1.2.2             
##  [71] RCurl_1.98-1.4              prettyunits_1.1.1          
##  [73] rpart_4.1-15                cowplot_1.1.1              
##  [75] SummarizedExperiment_1.14.1 haven_2.2.0                
##  [77] ggrepel_0.9.1               cluster_2.1.0              
##  [79] fs_1.5.0                    data.table_1.14.0          
##  [81] DO.db_2.9                   triebeard_0.3.0            
##  [83] reprex_0.3.0                reactome.db_1.68.0         
##  [85] matrixStats_0.60.1          hms_1.1.0                  
##  [87] mime_0.11                   evaluate_0.14              
##  [89] xtable_1.8-4                XML_3.99-0.3               
##  [91] jpeg_0.1-9                  readxl_1.3.1               
##  [93] compiler_3.6.3              crayon_1.4.1               
##  [95] htmltools_0.5.2             later_1.3.0                
##  [97] Formula_1.2-4               geneplotter_1.62.0         
##  [99] lubridate_1.7.10            DBI_1.1.1                  
## [101] tweenr_1.0.2                dbplyr_1.4.2               
## [103] MASS_7.3-53                 rappdirs_0.3.3             
## [105] Matrix_1.2-18               cli_3.0.1                  
## [107] igraph_1.2.6                GenomicRanges_1.36.1       
## [109] pkgconfig_2.0.3             rvcheck_0.1.8              
## [111] foreign_0.8-76              xml2_1.3.2                 
## [113] foreach_1.5.1               annotate_1.62.0            
## [115] bslib_0.3.0                 XVector_0.24.0             
## [117] webshot_0.5.2               rvest_0.3.5                
## [119] yulab.utils_0.0.2           digest_0.6.27              
## [121] graph_1.62.0                rmarkdown_2.7              
## [123] cellranger_1.1.0            fastmatch_1.1-3            
## [125] htmlTable_1.13.3            curl_4.3.2                 
## [127] shiny_1.6.0                 graphite_1.30.0            
## [129] lifecycle_1.0.0             jsonlite_1.7.2             
## [131] fansi_0.5.0                 pillar_1.6.2               
## [133] lattice_0.20-41             fastmap_1.1.0              
## [135] httr_1.4.2                  survival_3.2-7             
## [137] GO.db_3.8.2                 UpSetR_1.4.0               
## [139] png_0.1-7                   iterators_1.0.13           
## [141] bit_4.0.4                   ggforce_0.3.1              
## [143] stringi_1.7.4               sass_0.4.0                 
## [145] blob_1.2.2                  DESeq2_1.24.0              
## [147] latticeExtra_0.6-29         memoise_2.0.0